### replications script for  
# Two Americas of Well-Being: Divergent Rural-Urban Patterns of Life Satisfaction and Happiness from 2.6 B Social Media Posts}, 
# https://arxiv.org/abs/2511.10542 



rm(list=ls())

library(data.table)
library(knitr)
library(ggplot2)
library(scales)
library(margins)
library(kableExtra)
library(tidycensus)
library(xtable)

x_wide <- fread("x_wide.csv")
### start Table 1: Correlation between Life Satisfaction and Happiness by Year

corr_by_year <- x_wide[
  , {
    d <- .SD[!is.na(lifesat) & !is.na(happiness)]
    if (nrow(d) >= 3) {
      ct <- cor.test(d$lifesat, d$happiness)
      list(
        N    = nrow(d),
        r    = unname(ct$estimate),
        lo95 = unname(ct$conf.int[1]),
        hi95 = unname(ct$conf.int[2]),
        pval = ct$p.value
      )
    } else {
      list(N = nrow(d), r = NA_real_, lo95 = NA_real_, hi95 = NA_real_, pval = NA_real_)
    }
  },
  by = year
]

corr_by_year[order(year)]
tab <- corr_by_year[order(year)][!is.na(r)]
tab[, cor_ci := sprintf("%.3f [%.3f, %.3f]", r, lo95, hi95)]
tab[, pval_fmt := ifelse(pval < 1e-3, formatC(pval, format = "e", digits = 2),
                         sprintf("%.3f", pval))]
final_tab <- tab[, .(Year = year,  N,   `Correlation (95% CI)` = cor_ci,
                     `p-value` = pval_fmt)]


kable(final_tab, format = "latex", booktabs = TRUE,
      caption = "Correlation between Life Satisfaction and Happiness by Year")

### end Table 1: Correlation between Life Satisfaction and Happiness by Year



### start Figure 1: Relationship between Twitter-based life satisfaction and happiness across U.S. counties in 2020.

selected_year <- 2020
df <- as.data.table(x_wide)[year == selected_year, .(lifesat, happiness, rural)]
df <- df[is.finite(lifesat) & is.finite(happiness) & is.finite(rural)]
rural_levels <- sort(unique(df$rural))
df[, rural_f := factor(rural, levels = rural_levels)]
pal <- hue_pal()(length(rural_levels)); names(pal) <- as.character(rural_levels)
lims <- df[, .(
  xlo = quantile(lifesat,   0.01, na.rm = TRUE),
  xhi = quantile(lifesat,   0.99, na.rm = TRUE),
  ylo = quantile(happiness, 0.01, na.rm = TRUE),
  yhi = quantile(happiness, 0.99, na.rm = TRUE)
)]
ct <- cor.test(df$lifesat, df$happiness, use = "pairwise.complete.obs")
r  <- unname(ct$estimate)
p  <- ct$p.value
N  <- comma(sum(complete.cases(df[, .(lifesat, happiness)])))
p_txt <- ifelse(p < 0.001, "< 0.001", paste0("= ", format(round(p, 3), nsmall = 3)))
label_txt <- sprintf("Pearson r = %.2f (p %s)\nN = %s", r, p_txt, N)
x_ann <- lims$xlo + 0.02 * (lims$xhi - lims$xlo)
y_ann <- lims$yhi + 0.3 #* (lims$yhi - lims$ylo)

ggplot(df, aes(lifesat, happiness, color = rural_f)) +
  geom_point(alpha = 0.25, size = 1.1, shape = 16) +   # round dots
  geom_smooth(method = "lm", se = TRUE, color = "black",
              linewidth = 0.8, linetype = "dashed") +
  coord_cartesian(xlim = c(lims$xlo, lims$xhi),
                  ylim = c(lims$ylo, lims$yhi)) +
  coord_equal() +
  scale_color_manual(values = pal, name = "Rural code", drop = FALSE) +
  guides(color = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 16))) +
  annotate("label", x = x_ann, y = y_ann, label = label_txt,
           hjust = 0, vjust = 1, size = 4, label.size = 0,
           fill = "white", alpha = 0.75) +
  labs(
    title = sprintf("Life Satisfaction vs Happiness by Rural (%d)",selected_year),
    x = "Life satisfaction",
    y = "Happiness"
  ) +
  theme_minimal(base_size = 14) +
  theme(panel.grid.minor = element_blank(), aspect.ratio = 1)

### end Figure 1: Relationship between Twitter-based life satisfaction and happiness across U.S. counties in 2020.




### startFigure 2: Relationship between Twitter-based life satisfaction and happiness by rurality code (1–9) in 2020
df <- as.data.table(x_wide)[year==selected_year, .(lifesat, happiness, rural)]
df <- df[is.finite(lifesat) & is.finite(happiness) & is.finite(rural)]
df[, rural_f := factor(rural, levels = sort(unique(df$rural)))]
lims <- df[, .(
  xlo = quantile(lifesat,   0.01, na.rm = TRUE),
  xhi = quantile(lifesat,   0.99, na.rm = TRUE),
  ylo = quantile(happiness, 0.01, na.rm = TRUE),
  yhi = quantile(happiness, 0.99, na.rm = TRUE)
)]

ggplot(df, aes(lifesat, happiness, color = rural_f)) +
  geom_point(alpha = 0.25, size = 0.7, shape = 15) +        # squares
  geom_smooth(method = "lm", se = TRUE, color = "black",
              linewidth = 0.8, linetype = "dashed") +       # per-facet OLS
  coord_cartesian(xlim = c(lims$xlo, lims$xhi),
                  ylim = c(lims$ylo, lims$yhi)) +
  coord_equal() +
  facet_wrap(~ rural_f, ncol = 3) +
  # No scale_* here → inherits the SAME palette/mapping as your mod1 figure
  labs(
    title = sprintf("Life Satisfaction vs Happiness by Rural (%d)",selected_year),
    x = "Life satisfaction",
    y = "Happiness",
    color = "Rural code"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    panel.grid.minor = element_blank(),
    aspect.ratio = 1
  )

### end Figure 2: Relationship between Twitter-based life satisfaction and happiness by rurality code (1–9) in 2020


### start: model (1) logit and OLS version

## logit
modL1 <- glm(I(lifesat>0)  ~  rural  + margin + happiness
               , data = x_wide[lifesat_se>0 & happiness_se>0], 
               family = binomial(),
             weights =  1/(lifesat_se^2))

modL2 <- glm(I(lifesat>0)  ~ margin + rural  + I(acs5/1000) + happiness
               , data = x_wide[lifesat_se>0 & happiness_se>0], 
               family = binomial(),
             weights =  1/(lifesat_se^2))
modL3 <- glm(I(lifesat>0)  ~ margin + rural  + I(acs5/1000) + factor(year)+ happiness
               , data = x_wide[lifesat_se>0 & happiness_se>0], 
               family = binomial(),
             weights =  1/(lifesat_se^2))
modL4 <- glm(I(lifesat>0)  ~ margin * rural  + I(acs5/1000) + factor(year)+ happiness
               , data = x_wide[lifesat_se>0 & happiness_se>0], 
               family = binomial(),
             weights = 1/(lifesat_se^2))

## OLS
modL1_OLS <- lm(lifesat  ~  rural  + margin + happiness
             , data = x_wide[lifesat_se>0 & happiness_se>0], 
             weights =  1/(lifesat_se^2))

modL2_OLS <- lm(lifesat  ~ margin + rural  + I(acs5/1000) + happiness
             , data = x_wide[lifesat_se>0 & happiness_se>0], 
             weights =  1/(lifesat_se^2))

modL3_OLS <- lm(lifesat  ~ margin + rural  + I(acs5/1000) + factor(year)+ happiness
             , data = x_wide[lifesat_se>0 & happiness_se>0], 
             weights =  1/(lifesat_se^2))

modL4_OLS <- lm(lifesat  ~ margin * rural  + I(acs5/1000) + factor(year) + happiness
             , data = x_wide[lifesat_se>0 & happiness_se>0], 
             weights = 1/(lifesat_se^2))

### start Table 6:  Logit estimates of Pr(lifesat > 0) by county (2014–2022).
stargazer(modL1,modL2,modL3,modL4)
### end Table 6

### start Table 8: OLS estimates of lifesat by county (2014–2022).
stargazer(modL1_OLS,modL2_OLS,modL3_OLS,modL4_OLS)
### end Table 6


### start Table 2: model average margin effects (AME)
summary(margins(modL4))
### end Table 2




### start Table 4: Socioeconomic context for extreme counties (ACS 2022 5-year). Rates shown as percentages; last row reports U.S. average.
who <- x_wide[lifesat_se>0 & happiness_se>0]
# sort by year, then by rural (descending) and margin (descending)
setorder(who, year, -rural, -margin)
# extract State and County codes from StateCounty (first 2 and last 3 digits)
who[, `:=`(
  State = substr(StateCounty, 1, 2),
  County = substr(StateCounty, 3, 5)
)]

# subset of interest (rural > 7 and margin > 0.75)
extreme <- who[rural >= 7 & margin > 0.75,
           .(StateCounty, State, County, year, rural, margin, vtweets_max)]

extreme
fips <- unique(as.data.table(fips_codes)[, .(state_code, county_code, state_name, county)])
extreme_named <- merge(
  extreme,
  fips,
  by.x = c("State", "County"),
  by.y = c("state_code", "county_code"),
  all.x = TRUE
)


extreme_named[, .(state_name, county, year, rural, margin, vtweets_max)]
extreme_named[, score := rural * margin]
exDT <- copy(extreme_named)
exDT[, ex_score := rural * margin]
setorder(exDT, -ex_score, -vtweets_max)
exDT_dedup <- exDT[, .SD[1], by = .(state_name, county, year)]
setorder(exDT_dedup, -ex_score, -vtweets_max, -year)
ex_one_per_county <- exDT_dedup[, .SD[1], by = .(state_name, county)]

ex_out <- ex_one_per_county[order(state_name, county)][, .(
  Year   = year,
  State  = state_name,
  County = county,
  Rural_x_Margin = round(score, 3),
  Tweets = vtweets_max
)]

ex_out <- as.data.table(ex_out)
ex_named <- as.data.table(extreme_named)

if (!"StateCounty" %in% names(ex_out)) {
  ex_out <- merge(
    ex_out,
    unique(ex_named[, .(StateCounty, state_name, county)]),
    by.x = c("State","County"),
    by.y = c("state_name","county"),
    all.x = TRUE,
    sort  = FALSE
  )
}
ex_out[, StateCounty := sprintf("%05d", as.integer(StateCounty))]

if (!"Rural_x_Margin" %in% names(ex_out)) {
  if (all(c("Rural","Margin") %in% names(ex_out))) {
    ex_out[, Rural_x_Margin := Rural * Margin]
  }
}

ex_acs_vars <- c(
  poverty    = "B17001_002E",  # below poverty
  pop_pov    = "B17001_001E",  # poverty universe
  unemployed = "B23025_005E",
  laborforce = "B23025_003E",
  black      = "B02001_003E",
  native     = "B02001_004E",
  pop_total  = "B02001_001E",
  educ_total = "B15003_001E"
)
ex_educ_bach_cols <- paste0("B15003_", sprintf("%03dE", 22:25))  # BA+ cells

ex_vars_all <- unique(c(unname(ex_acs_vars), ex_educ_bach_cols))

# this requires internet connection
ex_acs <- get_acs(
  geography = "county",
  variables = ex_vars_all,
  year      = 2022
)

ex_acs_wide <- dcast(as.data.table(ex_acs), GEOID ~ variable, value.var = "estimate")
ex_acs_wide[, GEOID := as.character(GEOID)]
ex_acs_wide[, GEOID5 := ifelse(nchar(GEOID) >= 5, substr(GEOID, nchar(GEOID)-4, nchar(GEOID)), GEOID)]
ex_acs_wide[, GEOID5 := sprintf("%05d", as.integer(GEOID5))]

ex_resolve <- function(codeE, cols) {
  if (codeE %in% cols) return(codeE)
  code_noE <- sub("E$", "", codeE)
  if (code_noE %in% cols) return(code_noE)
  NA_character_
}
ex_cols <- names(ex_acs_wide)

ex_v <- list(
  pop_total  = ex_resolve("B02001_001E", ex_cols),
  black      = ex_resolve("B02001_003E", ex_cols),
  native     = ex_resolve("B02001_004E", ex_cols),
  laborforce = ex_resolve("B23025_003E", ex_cols),
  unemployed = ex_resolve("B23025_005E", ex_cols),
  pop_pov    = ex_resolve("B17001_001E", ex_cols),
  poverty    = ex_resolve("B17001_002E", ex_cols),
  educ_total = ex_resolve("B15003_001E", ex_cols)
)

ex_bach_candidates <- c(
  paste0("B15003_", sprintf("%03dE", 22:25)),
  paste0("B15003_", sprintf("%03d",  22:25))
)
ex_bach_present <- intersect(ex_bach_candidates, ex_cols)
ex_safe_div <- function(num, den) ifelse(den > 0, num / den, NA_real_)

if (length(ex_bach_present) > 0) {
  ex_acs_wide[, ex_bach_num := rowSums(.SD, na.rm = TRUE), .SDcols = ex_bach_present]
} else {
  ex_acs_wide[, ex_bach_num := NA_real_]
}

ex_acs_wide[, `:=`(
  ex_poverty_rate       = ex_safe_div(get(ex_v$poverty),    get(ex_v$pop_pov)),
  ex_unemployment_rate  = ex_safe_div(get(ex_v$unemployed), get(ex_v$laborforce)),
  ex_pct_black          = ex_safe_div(get(ex_v$black),      get(ex_v$pop_total)),
  ex_pct_native         = ex_safe_div(get(ex_v$native),     get(ex_v$pop_total)),
  ex_pct_bachelor       = ex_safe_div(ex_bach_num,          get(ex_v$educ_total))
)]

ex_stats <- merge(
  ex_out,
  ex_acs_wide[, .(
    StateCounty = GEOID5,
    ex_poverty_rate, ex_unemployment_rate, ex_pct_black, ex_pct_native, ex_pct_bachelor
  )],
  by   = "StateCounty",
  all.x = TRUE,
  sort  = FALSE
)

message("Matched ACS rows: ", sum(!is.na(ex_stats$ex_poverty_rate)), " / ", nrow(ex_stats))
if (any(is.na(ex_stats$ex_poverty_rate))) {
  message("Unmatched GEOIDs: ", paste(ex_stats[is.na(ex_poverty_rate), unique(StateCounty)], collapse = ", "))
}


ex_summary <- ex_stats[, .(
  mean_poverty   = mean(ex_poverty_rate,       na.rm = TRUE),
  mean_unemp     = mean(ex_unemployment_rate,  na.rm = TRUE),
  mean_black     = mean(ex_pct_black,          na.rm = TRUE),
  mean_native    = mean(ex_pct_native,         na.rm = TRUE),
  mean_bachelor  = mean(ex_pct_bachelor,       na.rm = TRUE)
)]
print(ex_summary)


ex_has_rural_margin <- all(c("Rural","Margin") %in% names(ex_stats))
if (ex_has_rural_margin) {
  ex_table <- ex_stats[, .(
    Year, State, County, Rural, Margin, Tweets,
    PovertyPct   = round(100 * ex_poverty_rate,       1),
    UnempPct     = round(100 * ex_unemployment_rate,  1),
    BlackPct     = round(100 * ex_pct_black,          1),
    NativePct    = round(100 * ex_pct_native,         1),
    BAplusPct    = round(100 * ex_pct_bachelor,       1)
  )][order(State, County)]
  ex_colnames <- c("Year","State","County","Rural","Margin","Max Tweets",
                   "Poverty %","Unemp %","Black %","Native %","BA+ %")
  ex_align <- c("c","l","l","c","c","r","r","r","r","r","r")
} else {
  ex_table <- ex_stats[, .(
    Year, State, County, Rural_x_Margin, Tweets,
    PovertyPct   = round(100 * ex_poverty_rate,       1),
    UnempPct     = round(100 * ex_unemployment_rate,  1),
    BlackPct     = round(100 * ex_pct_black,          1),
    NativePct    = round(100 * ex_pct_native,         1),
    BAplusPct    = round(100 * ex_pct_bachelor,       1)
  )][order(State, County)]
  ex_colnames <- c("Year","State","County","Rural×Margin","Max Tweets",
                   "Poverty %","Unemp %","Black %","Native %","BA+ %")
  ex_align <- c("c","l","l","c","r","r","r","r","r","r")
}


ex_acs_summary <- ex_acs_wide[, .(
  US_mean_poverty   = mean(ex_poverty_rate,       na.rm = TRUE),
  US_mean_unemp     = mean(ex_unemployment_rate,  na.rm = TRUE),
  US_mean_black     = mean(ex_pct_black,          na.rm = TRUE),
  US_mean_native    = mean(ex_pct_native,         na.rm = TRUE),
  US_mean_bachelor  = mean(ex_pct_bachelor,       na.rm = TRUE)
)]

ex_stats[, County := gsub("\\s+(County|Parish|Borough)$", "", County)]
setnames(ex_stats, old = "Tweets", new = "Tweets")
ex_table <- ex_stats[, .(
  Year, State, County, Rural_x_Margin,
  Tweets,
  PovertyPct   = round(100 * ex_poverty_rate,       1),
  UnempPct     = round(100 * ex_unemployment_rate,  1),
  BlackPct     = round(100 * ex_pct_black,          1),
  NativePct    = round(100 * ex_pct_native,         1),
  BAplusPct    = round(100 * ex_pct_bachelor,       1)
)][order(State, County)]

ex_us_row <- data.table(
  Year = NA_integer_,
  State = "U.S. average",
  County = "",
  Rural_x_Margin = NA_real_,
  Tweets = NA_real_,
  PovertyPct   = round(100 * ex_acs_summary$US_mean_poverty,   1),
  UnempPct     = round(100 * ex_acs_summary$US_mean_unemp,     1),
  BlackPct     = round(100 * ex_acs_summary$US_mean_black,     1),
  NativePct    = round(100 * ex_acs_summary$US_mean_native,    1),
  BAplusPct    = round(100 * ex_acs_summary$US_mean_bachelor,  1)
)
ex_table <- rbind(ex_table, ex_us_row, fill = TRUE)

ex_tab <- kbl(
  ex_table,
  format   = "latex",
  booktabs = TRUE,
  caption  = "Socioeconomic context for extreme counties (ACS 2022 5-year). Rates shown as percentages; last row reports U.S. average.",
  label    = "tab:ex_county_context",
  col.names= c("Year","State","County","Rural×Margin","Tweets",
               "Poverty %","Unemp %","Black %","Native %","BA+ %"),
  align    = c("c","l","l","c","r","r","r","r","r","r")
) |>
  kable_styling(latex_options = c("hold_position")) |>
  row_spec(nrow(ex_table), bold = TRUE)  # highlight the average row

save_kable(ex_tab, "ex_county_context.tex")
message("LaTeX saved to: ex_county_context.tex")

### end Table 4 ###



# generates plots by year
thr <- 0.10 
selected_year <- 2022
  
x2020 <- x_wide[year==selected_year]

mymodL <- modL4 # the full model specification

d20 <- as.data.table(x2020)[is.finite(margin) & is.finite(rural) & 
                              is.finite(acs5) & lifesat_se>0]
rural_levels <- sort(unique(d20$rural))
m_seq <- seq(min(d20$margin, na.rm = TRUE),
             max(d20$margin, na.rm = TRUE),
             length.out = 200)

# Hold ACS5 fixed at 2020 median
acs5_med20 <- median(d20$acs5, na.rm = TRUE)
ntw_med    <- median(d20$ntweets_max, na.rm = TRUE)
vtw_med    <- median(d20$vtweets_max, na.rm = TRUE)
hap_mean    <- median(d20$happiness, na.rm = TRUE)

# Prediction grid: margin × rural (acs5 fixed)
plotdat <- expand.grid(
  margin = m_seq,
  rural  = rural_levels,          # numeric in model
  acs5   = acs5_med20,
  year = selected_year,
  ntweets_max = ntw_med,
  vtweets_max = vtw_med,
  happiness = hap_mean
)

# Predict (link -> prob)
pred <- predict(mymodL, newdata = plotdat, type = "link", se.fit = TRUE)
plotdat$fit <- plogis(pred$fit)
plotdat$lwr <- plogis(pred$fit - 1.96 * pred$se.fit)
plotdat$upr <- plogis(pred$fit + 1.96 * pred$se.fit)

pal <- hue_pal()(length(rural_levels)); names(pal) <- as.character(rural_levels)

p1 <- ggplot(plotdat, aes(x = margin, y = fit,
                    color = factor(rural), fill = factor(rural))) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, group = factor(rural)),
              alpha = 0.18, color = NA) +
  geom_line(size = 1.15) +
  geom_vline(xintercept = 0, linetype = "dotted", linewidth = 0.5) +
  scale_y_continuous(labels = label_percent(accuracy = 1), limits = c(0, 1)) +
  scale_color_manual(values = pal, name = "Rural code", drop = FALSE) +
  scale_fill_manual(values  = pal, name = "Rural code", drop = FALSE) +
  labs(
    title = sprintf("Predicted P(lifesat > 0) across Vote Margin (Year = %d)",selected_year),
    subtitle = sprintf("ACS5 held at %d median = %s", selected_year, format(round(acs5_med20), big.mark=",")),
    x = "(Dem − GOP) Vote Margin",
    y = "P(lifesat > 0)"
  ) +
  theme_minimal(base_size = 14)
p1
ggsave(sprintf("lifesatVoteMedian_%d.pdf", selected_year), p1,width = 12,height = 6)

d20 <- as.data.table(x2020)[year == selected_year & is.finite(margin) & 
                              is.finite(rural) & is.finite(acs5) & lifesat_se>0]

rural_levels <- sort(unique(d20$rural))

acs5_lvls <- quantile(d20$acs5, probs = c(0.2, 0.5, 0.8), na.rm = TRUE)
acs5_lbls <- paste0(c("Low (20th) ~ ", "Median (50th) ~ ", "High (80th) ~ "),
                    format(round(acs5_lvls), big.mark=","))

m_seq <- seq(min(d20$margin, na.rm = TRUE),
             max(d20$margin, na.rm = TRUE),
             length.out = 200)

ntw_med <- median(d20$ntweets_max, na.rm = TRUE)
vtw_med <- median(d20$vtweets_max, na.rm = TRUE)
hap_med <- mean(d20$happiness, na.rm = TRUE)

plotdat <- expand.grid(
  margin = m_seq,
  rural  = rural_levels,             # numeric in model
  acs5   = as.numeric(acs5_lvls),
  KEEP.OUT.ATTRS = FALSE,
  stringsAsFactors = FALSE
)
plotdat$acs5_f <- factor(plotdat$acs5,
                         levels = as.numeric(acs5_lvls),
                         labels = acs5_lbls)
plotdat$year <- selected_year
plotdat$ntweets_max <- ntw_med
plotdat$vtweets_max <- vtw_med
plotdat$happiness <- hap_med

pred <- predict(mymodL, newdata = plotdat, type = "link", se.fit = TRUE)
plotdat$fit <- plogis(pred$fit)
plotdat$lwr <- plogis(pred$fit - 1.96 * pred$se.fit)
plotdat$upr <- plogis(pred$fit + 1.96 * pred$se.fit)

pal <- scales::hue_pal()(length(rural_levels)); names(pal) <- as.character(rural_levels)

p2<- ggplot(plotdat, aes(x = margin, y = fit,
                    color = factor(rural), fill = factor(rural))) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, group = factor(rural)),
              alpha = 0.18, color = NA) +
  geom_line(size = 1.15) +
  geom_vline(xintercept = 0, linetype = "dotted", linewidth = 0.5) +
  facet_wrap(~ acs5_f, nrow = 1) +
  scale_y_continuous(labels = label_percent(accuracy = 1), limits = c(0, 1)) +
  scale_color_manual(values = pal, name = "Rural code", drop = FALSE) +
  scale_fill_manual(values  = pal, name = "Rural code", drop = FALSE) +
  labs(
    title = "Predicted P(lifesat > 0) across Vote Margin",
    subtitle = "ACS 5-year median household income (20th / 50th / 80th pct)",
    x = sprintf("(Dem − GOP) Vote Margin %d",selected_year),
    y = "P(lifesat > 0)"
  ) +
  theme_minimal(base_size = 14)
p2

ggsave(sprintf("lifesatVotebyClass_%d.pdf", selected_year), p2,width = 12,height = 6)

x2020[, margin_class :=
        fifelse(!is.na(margin) & margin <= -thr, "GOP-leaning (≤ -10pp)",
                fifelse(!is.na(margin) & margin >=  thr, "DEM-leaning (≥ +10pp)", "Toss-up (±10pp)"))]

d20 <- as.data.table(x2020)[ is.finite(margin) & is.finite(rural) &
                               is.finite(acs5) & lifesat_se>0]

d20[, margin_class := factor(margin_class,
                             levels = c("GOP-leaning (≤ -10pp)", "Toss-up (±10pp)", "DEM-leaning (≥ +10pp)")
)]


m_centers <- d20[, .(margin_center = mean(margin, na.rm = TRUE)), by = margin_class][order(margin_class)]

acs_seq      <- seq(min(d20$acs5, na.rm = TRUE), max(d20$acs5, na.rm = TRUE), length.out = 200)
rural_levels <- sort(unique(d20$rural))
ntw_med <- median(d20$ntweets_max, na.rm = TRUE)
vtw_med <- median(d20$vtweets_max, na.rm = TRUE)
hap_med <- median(d20$happiness, na.rm = TRUE)

plotdat <- rbindlist(lapply(seq_len(nrow(m_centers)), function(i) {
  expand.grid(
    acs5         = acs_seq,
    rural        = rural_levels,
    margin       = m_centers$margin_center[i],
    margin_class = m_centers$margin_class[i],
    KEEP.OUT.ATTRS = FALSE, 
    stringsAsFactors = FALSE
  )
}), use.names = TRUE)
plotdat$year <- selected_year
plotdat$ntweets_max <- ntw_med
plotdat$vtweets_max <- vtw_med
plotdat$happiness <- hap_med


pred <- predict(mymodL, newdata = plotdat, type = "link", se.fit = TRUE)
plotdat$fit <- plogis(pred$fit)
plotdat$lwr <- plogis(pred$fit - 1.96 * pred$se.fit)
plotdat$upr <- plogis(pred$fit + 1.96 * pred$se.fit)


pal <- hue_pal()(length(rural_levels)); names(pal) <- as.character(rural_levels)

p3 <- ggplot(plotdat, aes(x = acs5, y = fit,
                    color = factor(rural), fill = factor(rural))) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, group = factor(rural)),
              alpha = 0.18, color = NA) +
  geom_line(size = 1.1) +
  facet_wrap(~ margin_class, nrow = 1) +
  scale_x_continuous(labels = label_number(big.mark = ",")) +
  scale_y_continuous(labels = label_percent(accuracy = 1), limits = c(0, 1)) +
  scale_color_manual(values = pal, name = "Rural code", drop = FALSE) +
  scale_fill_manual(values  = pal, name = "Rural code", drop = FALSE) +
  labs(
    x = "ACS 5-year Median Household Income (USD)",
    y = "P(lifesat > 0)",
    color = "Rural code",
    fill  = "Rural code",
    title = sprintf("Predicted P(lifesat > 0) across ACS5 income levels by margin class (%d)",selected_year),
    subtitle = "Within each facet, margin is held at the class mean"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "right")
p3

ggsave(sprintf("lifesatACSbyVote_%d.pdf", selected_year), p3,width = 12,height = 6)

### end plots; end lifesat modeling ###


### HAPPINESS ###

### login and OLS models.

### model (2) logit
modH1 <- glm(I(happiness>0)  ~  rural  + margin +lifesat
             , data = x_wide[lifesat_se>0 & happiness_se>0], 
             family = binomial(),
             weights = 1/(happiness_se)^2)

modH2 <- glm(I(happiness>0)  ~ margin + rural  + I(acs5/1000) +lifesat
             , data = x_wide[lifesat_se>0 & happiness_se>0], 
             family = binomial(),
             weights = 1/(happiness_se)^2)

modH3 <- glm(I(happiness>0)  ~ margin + rural  + I(acs5/1000) + factor(year)+lifesat
             , data = x_wide[lifesat_se>0 & happiness_se>0], 
             family = binomial(),
             weights = 1/(happiness_se)^2)

modH4 <- glm(I(happiness>0)  ~ margin * rural  + I(acs5/1000) + factor(year)+lifesat
             , data = x_wide[lifesat_se>0 & happiness_se>0], 
             family = binomial(),
             weights = 1/(happiness_se)^2)

### OLS models
modH1_OLS <- lm(happiness  ~  rural  + margin +lifesat
             , data = x_wide[lifesat_se>0 & happiness_se>0], 
             weights = 1/(happiness_se)^2)

modH2_OLS <- lm(happiness ~ margin + rural  + I(acs5/1000) +lifesat
             , data = x_wide[lifesat_se>0 & happiness_se>0], 
             weights = 1/(happiness_se)^2)

modH3_OLS <- lm(happiness  ~ margin + rural  + I(acs5/1000) + factor(year)+lifesat
             , data = x_wide[lifesat_se>0 & happiness_se>0], 
             weights = 1/(happiness_se)^2)

modH4_OLS <- lm(happiness  ~ margin * rural  + I(acs5/1000) + factor(year) +lifesat
             , data = x_wide[lifesat_se>0 & happiness_se>0], 
             weights = 1/(happiness_se)^2)

### start Table 7
stargazer(modH1,modH2,modH3,modH4)
### end Table 7

### start Table 9
stargazer(modH1_OLS,modH2_OLS,modH3_OLS,modH4_OLS)
### end Table 9

### start AME for happiness
summary(margins(modH4))
### end AME for happiness


round_dfH <- as.data.frame(summary(margins(modH4)))
print(xtable(round_dfH, digits = c(0, 3, 4, 4, 2, 3, 3, 3)), include.rownames = FALSE)

### plots for happiness
selected_year <- 2022

x2020 <- x_wide[year == selected_year & happiness_se>0 & lifesat_se>0]

mymodH <- modH4  # full model

acs5_med <- median(x2020$acs5, na.rm = TRUE)
ntw_med <- median(x2020$ntweets_max, na.rm = TRUE)
vtw_med <- median(x2020$vtweets_max, na.rm = TRUE)
sat_med <- median(x2020$lifesat, na.rm = TRUE)

plotdat <- expand.grid(
  margin = seq(min(x2020$margin, na.rm = TRUE),
               max(x2020$margin, na.rm = TRUE),
               length.out = 200),
  rural  = sort(unique(x2020$rural)),   # numeric
  acs5   = acs5_med,
  year = selected_year,
  vtweets_max = vtw_med,
  lifesat = sat_med
)

pred <- predict(mymodH, newdata = plotdat, type = "link", se.fit = TRUE)
plotdat$fit <- plogis(pred$fit)
plotdat$lwr <- plogis(pred$fit - 1.96 * pred$se.fit)
plotdat$upr <- plogis(pred$fit + 1.96 * pred$se.fit)

ggplot(plotdat, aes(x = margin, y = fit,
                    color = factor(rural), fill = factor(rural))) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, group = factor(rural)),
              alpha = 0.15, color = NA) +
  geom_line(size = 1.2) +
  labs(x = sprintf("(Dem − GOP) %d Vote Margin",selected_year),
       y = "P(happiness > 0)",
       color = "Rural code",
       fill  = "Rural code",
       subtitle = sprintf("ACS5 held at median = %s", format(round(acs5_med), big.mark=","))) +
  theme_minimal(base_size = 14)



d20 <- as.data.table(x2020)[ is.finite(margin) & is.finite(rural) & 
                               is.finite(acs5)  & lifesat_se>0 & happiness_se>0]


rural_levels <- sort(unique(d20$rural))

acs5_lvls <- quantile(d20$acs5, probs = c(0.2, 0.5, 0.8), na.rm = TRUE)
acs5_lbls <- paste0(c("Low (20th) ~ ", "Median (50th) ~ ", "High (80th) ~ "),
                    format(round(acs5_lvls), big.mark=","))


m_seq <- seq(min(d20$margin, na.rm = TRUE),
             max(d20$margin, na.rm = TRUE),
             length.out = 200)
ntw_med <- median(d20$vtweets_max, na.rm = TRUE)
sat_med <- median(d20$lifesat, na.rm = TRUE)


plotdat <- expand.grid(
  margin = m_seq,
  rural  = rural_levels,             # numeric in model
  acs5   = as.numeric(acs5_lvls)
)
plotdat$acs5_f <- factor(plotdat$acs5,
                         levels = as.numeric(acs5_lvls),
                         labels = acs5_lbls)

plotdat$year <- selected_year
plotdat$vtweets_max <- vtw_med
plotdat$lifesat <- sat_med


pred <- predict(mymodH, newdata = plotdat, type = "link", se.fit = TRUE)
plotdat$fit <- plogis(pred$fit)
plotdat$lwr <- plogis(pred$fit - 1.96 * pred$se.fit)
plotdat$upr <- plogis(pred$fit + 1.96 * pred$se.fit)


pal <- scales::hue_pal()(length(rural_levels)); names(pal) <- as.character(rural_levels)

ppp<- ggplot(plotdat, aes(x = margin, y = fit,
                    color = factor(rural), fill = factor(rural))) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, group = factor(rural)),
              alpha = 0.18, color = NA) +
  geom_line(size = 1.15) +
  geom_vline(xintercept = 0, linetype = "dotted", linewidth = 0.5) +
  facet_wrap(~ acs5_f, nrow = 1) +
  scale_y_continuous(labels = label_percent(accuracy = 1), limits = c(0.9, 1)) +
  scale_color_manual(values = pal, name = "Rural code", drop = FALSE) +
  scale_fill_manual(values  = pal, name = "Rural code", drop = FALSE) +
  labs(
    title = "Predicted P(happiness > 0) across Vote Margin",
    subtitle = "ACS 5-year median household income (20th / 50th / 80th pct)",
    x = sprintf("(Dem − GOP) Vote Margin %d",selected_year),
    y = "P(happiness > 0)"
  ) +
  theme_minimal(base_size = 14)
ppp
 ggsave(sprintf("HappinessByACS5%d.pdf", selected_year), ppp,width = 9,height = 6)


thr <- 0.10  # 10 percentage points = 0.10 (Dem − GOP)
d20 <- x2020[, margin_class :=
        fifelse(!is.na(margin) & margin <= -thr, "GOP-leaning (≤ -10pp)",
                fifelse(!is.na(margin) & margin >=  thr, "DEM-leaning (≥ +10pp)", "Toss-up (±10pp)"))]


d20[, margin_class := factor(margin_class,
                                levels = c("GOP-leaning (≤ -10pp)", "Toss-up (±10pp)", "DEM-leaning (≥ +10pp)"))]

d20 <- d20[is.finite(margin) & is.finite(rural) & 
             is.finite(acs5) & happiness_se>0 & lifesat_se>0, ]

m_centers <- d20[, .(margin_center = mean(margin, na.rm = TRUE)), by = margin_class]

acs_seq <- seq(min(d20$acs5, na.rm = TRUE),
               max(d20$acs5, na.rm = TRUE),
               length.out = 200)

rural_levels <- sort(unique(d20$rural))  # numeric

plotdat <- rbindlist(lapply(seq_len(nrow(m_centers)), function(i) {
  expand.grid(
    acs5   = acs_seq,
    rural  = rural_levels,
    margin = m_centers$margin_center[i],
    margin_class = m_centers$margin_class[i],
    KEEP.OUT.ATTRS = FALSE, 
    stringsAsFactors = FALSE
  )
}), use.names = TRUE)

plotdat$year <- selected_year
ntw_med <- median(d20$ntweets_max, na.rm = TRUE)
vtw_med <- median(d20$vtweets_max, na.rm = TRUE)
sat_med <- median(d20$lifesat, na.rm = TRUE)
plotdat$ntweets_max <- ntw_med
plotdat$vtweets_max <- vtw_med
plotdat$lifesat <- sat_med


pred <- predict(mymodH, newdata = plotdat, type = "link", se.fit = TRUE)
plotdat$fit <- plogis(pred$fit)
plotdat$lwr <- plogis(pred$fit - 1.96 * pred$se.fit)
plotdat$upr <- plogis(pred$fit + 1.96 * pred$se.fit)


ppp2 <- ggplot(plotdat, aes(x = acs5, y = fit, color = factor(rural), fill = factor(rural))) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, group = factor(rural)),
              alpha = 0.18, color = NA) +
  geom_line(size = 1.1) +
  facet_wrap(~ margin_class, nrow = 1) +
  scale_x_continuous(labels = label_number(big.mark = ",")) +
  scale_y_continuous(labels = label_percent(accuracy = 1), limits = c(0.9, 1)) +
  labs(
    x = "ACS 5-year Median Household Income (USD)",
    y = "P(happiness > 0)",
    color = "Rural code",
    fill  = "Rural code",
    title = "Predicted P(happiness>0) across ACS5 income levels by margin class",
    subtitle = "Within each facet, margin is held at the class mean"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "right")

ppp2
ggsave(sprintf("HappinessByMargin%d.pdf", selected_year), ppp2,width = 9,height = 6)

### end happiness plots

### end script

